############################################################################
#Gijs Schumacher, Martijn Schoonvelde, Tanushree Dahiya, Erik de Vries         
#
#This script generates two files from all (English and Translated) speeches:
#(1) "super.dtm" which is a document-term matrix of all speeches;
#(2) "supercorpus", which contains for all speeches their text and metadata
############################################################################

# Remove every object that ls() lists in the current environment before the
# script starts building its own objects.
rm(list=ls())
# stringr supplies str_detect(); quanteda supplies dfm(), stopwords() and
# features(), all of which are called further down in this script.
library(stringr)
library(quanteda)

# NOTE(review): "Current Working directory" looks like a placeholder -- replace
# it with the actual project directory before running.
setwd("Current Working directory")

# NB: make sure that the current working directory contains the unzipped
# speeches files and the remove_words.csv file.
# NOTE(review): the code further down reads "/biglist.csv", not
# remove_words.csv -- confirm which file name is the correct one.

# Collect the speech files. `pattern` is a regular expression, not a shell
# glob, so the extension is escaped and anchored to the end of the file name.
files <- list.files("", pattern = "\\.RData$", full.names = TRUE)
if (length(files) == 0) {
  stop("No .RData files found; check the directory passed to list.files().",
       call. = FALSE)
}

# Merge the corpus stored in each file into a single `supercorpus`.
# The branch taken depends on the position of the file in `files`:
#   * files 1-14 are expected to load an object named `data.subset`;
#   * files 15 and later are expected to load an object named `data`.
# NOTE(review): this relies on the order in which list.files() returns the
# files and on the object names saved inside each file -- confirm both.
for (i in seq_along(files)) {
  load(files[i])
  if (i == 1) {
    supercorpus <- data.subset
  } else if (i < 15) {
    supercorpus <- supercorpus + data.subset
  } else {
    # Drop the entries of `data$documents` whose name is NA before merging.
    # Single-bracket assignment handles zero, one or several NA names, whereas
    # `[[<-` fails unless exactly one name is NA.
    na_named <- which(is.na(names(data$documents)))
    if (length(na_named) > 0) {
      data$documents[na_named] <- NULL
    }
    supercorpus <- supercorpus + data
  }
}

# Build the vector of words to exclude from the document-term matrix.
# The file is read as a headerless, semicolon-separated table.
# NOTE(review): assumes exactly two columns (word; flag) so that dropping the
# second column leaves a plain vector -- confirm against the file.
ignorewords <- read.csv("/biglist.csv", header = FALSE, sep = ";",
                        stringsAsFactors = FALSE)
# Discard rows whose second column equals 1, then drop that column.
# A logical mask is used instead of `-which(...)`: when no row is flagged,
# `-which(...)` is an empty index and would silently discard every row.
# `%in%` never yields NA, so rows with a missing flag are kept, as before.
flagged <- ignorewords[, 2] %in% 1
ignorewords <- ignorewords[!flagged, -2]
ignorewords <- as.character(ignorewords)
# Extra words appended by hand to the list read from the file.
ignorewords <- c(ignorewords, "lei", "agus", "oifig", "bhfeidhm", "hivaid",
                 "bheidh", "ag", "mullaigh", "beidh", "mban")

# Build the document-term matrix, excluding English stopwords and the custom
# ignore list.
# NOTE(review): `ignoredFeatures` and features() belong to an older quanteda
# API (later versions use `remove` and featnames()) -- confirm the installed
# version before changing them.
super.dtm <- dfm(supercorpus,
                 ignoredFeatures = c(stopwords("english"), ignorewords))

# Remove features whose name matches the pattern below.
# `which(!...)` is used instead of `-which(...)`: when no feature matches,
# `-which(...)` is an empty index and would drop every column of the matrix.
# NOTE(review): the pattern consists of U+FFFD replacement characters; the
# characters originally listed were presumably lost in an encoding conversion
# -- confirm against the original script.
matches_pattern <- str_detect(features(super.dtm), "�|�|�|�|�")
super.dtm <- super.dtm[, which(!matches_pattern)]

# Keep only features whose total count across all documents exceeds 1 ...
super.dtm <- super.dtm[, which(colSums(super.dtm) > 1)]
# ... and whose name is longer than one character.
super.dtm <- super.dtm[, which(nchar(features(super.dtm)) > 1)]

# Write both results to disk, one object per .RData file; each object is
# stored under its own name ("super.dtm" and "supercorpus").
save(list = "super.dtm",
     file = "/Single English dtm/super.dtm.RData")
save(list = "supercorpus",
     file = "/Single English dtm/supercorpus.RData")

